import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import statistics as stats
from sklearn import preprocessing
import plotly.express as px
from string import ascii_letters
Based on the assessment description, the assumption for this exercise is that 'train.csv' is the only labelled data available to the Junior Data Scientist. This data will be used to train and validate the model being built.
Once a model is selected, the 'test.csv' will be viewed as 'market data' from real applicants that need to be verified for loan eligibility through my proposed automated process.
# load the labelled training data (the only labelled data available)
customer_info = pd.read_csv('../data/train.csv')
# create copy of customer_info for cleaning
customer_info_cleaned = customer_info.copy()
# drop Loan_ID column (a record identifier, not a predictive feature)
customer_info_cleaned.drop(columns=['Loan_ID'], inplace=True)
'Loan_ID' is not relevant for analysis or predicting loan status. In fact, this column could make our model think there is an associated order with each record.
We don't necessarily know enough about the 'Credit_History' column or what it is telling us. Upon an initial look at the data, it is clearly not credit scores, but binary values. So, we will explore the values by Eligible and Not Eligible applicants to understand what the values could mean.
# split Credit_History by loan outcome so the two distributions can be compared
eligible_credit = customer_info_cleaned.loc[customer_info_cleaned['Loan_Status'] == 'Y', 'Credit_History']
not_eligible_credit = customer_info_cleaned.loc[customer_info_cleaned['Loan_Status'] == 'N', 'Credit_History']
# overlay the two histograms on one axis
plt.hist(eligible_credit, label='Eligible')
plt.hist(not_eligible_credit, label='Not Eligible')
plt.legend(loc='upper right')
plt.title("Distribution of Credit History for Loan Applicants")
plt.ylabel("Number of Applicants")
plt.xlabel("Credit History")
plt.show()
Based on these results, we can see that all eligible loan applicants fall under 1. We can assume this to mean that there is a credit history, whereas 0 denotes no credit history. Based on the division of this value between Eligible and Not Eligible applicants, we will keep this feature for data exploration, summary, and modeling purposes
# overview of dtypes and non-null counts for the raw training data
customer_info.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 614 entries, 0 to 613 Data columns (total 13 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Loan_ID 614 non-null object 1 Gender 601 non-null object 2 Married 611 non-null object 3 Dependents 599 non-null object 4 Education 614 non-null object 5 Self_Employed 582 non-null object 6 ApplicantIncome 614 non-null int64 7 CoapplicantIncome 614 non-null float64 8 LoanAmount 592 non-null float64 9 Loan_Amount_Term 600 non-null float64 10 Credit_History 564 non-null float64 11 Property_Area 614 non-null object 12 Loan_Status 614 non-null object dtypes: float64(4), int64(1), object(8) memory usage: 62.5+ KB
We notice that a feature with 614 non-null records indicates a column with no null values. We notice some of our features, like Gender, have values below 614 returned non-null records. However, it does not look like any significant amount of data (ex: >50%) is missing for any of our features.
# explore total number of null values for each feature
customer_info.isnull().sum()
Loan_ID 0 Gender 13 Married 3 Dependents 15 Education 0 Self_Employed 32 ApplicantIncome 0 CoapplicantIncome 0 LoanAmount 22 Loan_Amount_Term 14 Credit_History 50 Property_Area 0 Loan_Status 0 dtype: int64
There are a few ways we can handle these nulls. To begin, we will drop any rows with null values for features where the number of null values is relatively insignificant (< 3%) of the total rows in our data.
# create function to drop any rows with null values for features where the number
# of null values is <3% of the total rows in our data
def drop_nas(df):
    """Drop rows (in place) that are null in any feature whose null count is
    below 3% of the dataset's *original* row count.

    Returns the same DataFrame for convenience.
    """
    # BUG FIX: compute the 3% threshold from the original row count once.
    # The original recomputed len(df) inside the loop, so the threshold
    # silently shrank after each drop and later features were judged
    # against a smaller dataset than the comment promised.
    threshold = len(df) * .03
    for feature in df.columns:
        null_sum = df[feature].isnull().sum()
        if 0 < null_sum < threshold:
            df.dropna(subset=[feature], inplace=True)
    return df
# run drop_nas on clean copy of customer_info
# (drop_nas mutates in place and also returns the same frame)
customer_info_cleaned = drop_nas(customer_info_cleaned)
# verify results
customer_info_cleaned.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 573 entries, 0 to 613 Data columns (total 12 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Gender 573 non-null object 1 Married 573 non-null object 2 Dependents 573 non-null object 3 Education 573 non-null object 4 Self_Employed 542 non-null object 5 ApplicantIncome 573 non-null int64 6 CoapplicantIncome 573 non-null float64 7 LoanAmount 553 non-null float64 8 Loan_Amount_Term 573 non-null float64 9 Credit_History 524 non-null float64 10 Property_Area 573 non-null object 11 Loan_Status 573 non-null object dtypes: float64(4), int64(1), object(7) memory usage: 58.2+ KB
Based on the train data, these results are near what we would expect.
The features that contained <3% of the original 614 rows were 'Gender' (13 null records), Married (3 null records), 'Dependents' (15 null records), and 'Loan_Amount_term' (14 null records).
We can assume some of the nulls for these features happened within the same record. We still have 573 rows, which we are happy with because we are well over the minimum '10 records per feature' rule of thumb for a dataframe that will be used for modeling.
# check count for remaining nulls (features that were >= the 3% threshold)
customer_info_cleaned.isnull().sum()
Gender 0 Married 0 Dependents 0 Education 0 Self_Employed 31 ApplicantIncome 0 CoapplicantIncome 0 LoanAmount 20 Loan_Amount_Term 0 Credit_History 49 Property_Area 0 Loan_Status 0 dtype: int64
For the remaining features with nulls noted above, I will impute their value with a measure of central tendency. Before deciding which measure to use, I will look at the spread of the data. I will also considering the value in the 'Loan_Status' column, because if the two groups' data behaves differently, I will impute based on this feature's value.
However, we first need to check that the data types of our features are integers or floats in order to run these statistics. And if not, change as necessary.
# check dtypes before running numeric summary statistics
customer_info_cleaned.dtypes
Gender object Married object Dependents object Education object Self_Employed object ApplicantIncome int64 CoapplicantIncome float64 LoanAmount float64 Loan_Amount_Term float64 Credit_History float64 Property_Area object Loan_Status object dtype: object
The columns that are type object are all columns that should have their values encoded.
# create function to encode object values with alphabetical label encoding
# leave null values for now until we impute their values
def column_encoder(data):
    """Label-encode the categorical columns of `data` in place, skipping nulls.

    pd.factorize(..., sort=True) assigns integer codes in sorted (alphabetical)
    order — exactly the codes sklearn's LabelEncoder produced — and writing the
    result back through .loc[mask, col] avoids the chained-assignment
    SettingWithCopyWarning the original column-by-column version triggered
    (chained assignment may silently fail to update the frame).

    Returns the same DataFrame for convenience.
    """
    categorical_columns = ['Gender', 'Married', 'Dependents', 'Education',
                           'Self_Employed', 'Property_Area', 'Loan_Status']
    for col in categorical_columns:
        not_null = data[col].notnull()
        data.loc[not_null, col] = pd.factorize(data.loc[not_null, col], sort=True)[0]
    return data
# run function (encodes in place and returns the same frame)
customer_info_cleaned = column_encoder(customer_info_cleaned)
C:\Users\asliw\AppData\Local\Temp\ipykernel_7144\4257256577.py:7: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy data['Gender'].loc[data['Gender'].notnull()] = label_encoder.fit_transform(data['Gender'].loc[data['Gender'].notnull()]) C:\Users\asliw\AppData\Local\Temp\ipykernel_7144\4257256577.py:8: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy data['Married'].loc[data['Married'].notnull()] = label_encoder.fit_transform(data['Married'].loc[data['Married'].notnull()]) C:\Users\asliw\AppData\Local\Temp\ipykernel_7144\4257256577.py:9: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy data['Dependents'].loc[data['Dependents'].notnull()] = label_encoder.fit_transform(data['Dependents'].loc[data['Dependents'].notnull()]) C:\Users\asliw\AppData\Local\Temp\ipykernel_7144\4257256577.py:10: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy data['Education'].loc[data['Education'].notnull()] = label_encoder.fit_transform(data['Education'].loc[data['Education'].notnull()]) C:\Users\asliw\AppData\Local\Temp\ipykernel_7144\4257256577.py:11: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy 
data['Self_Employed'].loc[data['Self_Employed'].notnull()] = label_encoder.fit_transform(data['Self_Employed'].loc[data['Self_Employed'].notnull()]) C:\Users\asliw\AppData\Local\Temp\ipykernel_7144\4257256577.py:12: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy data['Property_Area'].loc[data['Property_Area'].notnull()] = label_encoder.fit_transform(data['Property_Area'].loc[data['Property_Area'].notnull()]) C:\Users\asliw\AppData\Local\Temp\ipykernel_7144\4257256577.py:13: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy data['Loan_Status'].loc[data['Loan_Status'].notnull()] = label_encoder.fit_transform(data['Loan_Status'].loc[data['Loan_Status'].notnull()])
Results of the column_encoder function are:
'Gender': Female = 0, Male = 1
'Married': No = 0, Yes = 1
'Dependents': 0 = 0, 1 = 1, 2 = 2, 3+ = 3
'Education': Graduate = 0, Not Graduate = 1
'Self Employed': No = 0, Yes = 1
'Property_Area': Rural = 0, Semi-Urban = 1, Urban = 2
'Loan_Status': N = 0, Y = 1
I chose Label Encoding because it encodes in alphabetical order and most of our columns only contain 2 categories. However, 'Dependents' does actually have an associated order of dependents claimed on a customer application, which the label encoder preserves. Additionally, 'Property Area' has an implied order of population density. Having rural as the lowest value and Urban as highest is a good representation for this data.
# dtypes are still objects, change to int
# errors='coerce' turns any value that cannot be parsed as a number into NaN
columns = ['Gender','Married','Dependents','Education','Self_Employed','Property_Area','Loan_Status']
customer_info_cleaned[columns] = customer_info_cleaned[columns].apply(pd.to_numeric, errors='coerce')
# impute null values for 'Self_Employed' based on mode for 'Loan_Status' 1 and 0
# mode is chosen because this data is categorical in nature
# mode for non-eligible applicants (Loan_Status == 0)
print(customer_info_cleaned['Self_Employed'].loc[customer_info_cleaned['Loan_Status'] == 0].mode())
# mode for eligible applicants (Loan_Status == 1)
print(customer_info_cleaned['Self_Employed'].loc[customer_info_cleaned['Loan_Status'] == 1].mode())
0 0.0 Name: Self_Employed, dtype: float64 0 0.0 Name: Self_Employed, dtype: float64
# histogram of Self_Employed split by eligibility group
x = customer_info_cleaned['Self_Employed'].loc[customer_info_cleaned['Loan_Status']==1]
y = customer_info_cleaned['Self_Employed'].loc[customer_info_cleaned['Loan_Status']==0]
plt.hist(x, label='Eligible')
plt.hist(y, label='Not Eligible')
plt.legend(loc='upper right')
plt.title("Distribution of Self Employed Home Loan Applicants")
plt.ylabel("Number of Applicants")
plt.xlabel("Not Self Employed (0) or Self Employed (1)")
plt.show()
The mode is 0, or not self-employed for both eligible and non-eligible home loan applicants. So, we will impute any nulls for this column with 0.
According to our histogram, only ~17% of the data contains applicants who are self-employed.
# for both eligible and non-eligible loan applicants, change Self_Employed nulls to 0
# (0 is the mode for both groups, so a single fill value is safe)
customer_info_cleaned['Self_Employed'].fillna(0, inplace=True)
Now, we will look at spread for 'LoanAmount' to determine the best measure of central tendency to impute nulls with.
# look at distribution for 'LoanAmount' by eligibility
# alpha=0.5 makes the overlapping histograms both visible
x = customer_info_cleaned['LoanAmount'].loc[customer_info_cleaned['Loan_Status']==1]
y = customer_info_cleaned['LoanAmount'].loc[customer_info_cleaned['Loan_Status']==0]
plt.hist(x, alpha=0.5,label='Eligible')
plt.hist(y, alpha=0.5,label='Not Eligible')
plt.legend(loc='upper right')
plt.title('Loan Amount Requested by Applicant Eligibility')
plt.xlabel('Loan Amount (thousands, USD)')
plt.ylabel('Frequency')
plt.show()
Key insights from the histograms above are:
Median will be the best measure of central tendency as this data is skewed and is not categorical.
# determine if we need to break median out by Eligible or Not Eligible groups
# compare the overall median with each group's median before imputing
loan_amount_median = customer_info_cleaned['LoanAmount'].median()
loan_amount_not_eligible_median = customer_info_cleaned['LoanAmount'].loc[customer_info_cleaned['Loan_Status']==0].median()
loan_amount_eligible_median = customer_info_cleaned['LoanAmount'].loc[customer_info_cleaned['Loan_Status']==1].median()
print(loan_amount_median)
print(loan_amount_not_eligible_median)
print(loan_amount_eligible_median)
128.0 130.5 126.0
The medians are quite close for the eligible and not eligible groups, so we can use the median calculated for the entire column when imputing nulls.
# impute nulls for 'LoanAmount' with the overall median:
# the eligible / not-eligible medians are nearly identical (126.0 vs 130.5),
# so a single dataset-wide value is used for every applicant.
# (the original code also reassigned loan_amount_eligible_median to the
# overall median here — contradicting its own comment and never using the
# result; that dead, misleading assignment is removed.)
customer_info_cleaned['LoanAmount'].fillna(loan_amount_median, inplace=True)
Lastly, for our columns with nulls, we will look at spread for 'Credit_History' to determine the best measure of central tendency to impute nulls with.
At the beginning of this notebook, when exploring unecessary columns to remove, we reviewed the distribution of this column and discovered the results vary (1 or 0) for only the not eligible applicants. All eligible applicants have a value of 1 for credit history.
So, we will impute all missing 'Credit_History' values with 1 for eligible applicants, but review a bit further the breakdown of not eligible applicant values to guide how to impute this group's nulls.
# review how many Not Eligible applicants are 1 or 0 for 'Credit_History' (non-null only)
# BUG FIX: parentheses are required around the comparison because `&` binds
# more tightly than `==` in Python; the original `Loan_Status == 0 & ...notnull()`
# evaluated `0 & notnull()` first, silently reducing the mask to `Loan_Status == 0`
# and dropping the notnull filter entirely.
not_eligible_credit_history = customer_info_cleaned['Credit_History'].loc[
    (customer_info_cleaned['Loan_Status'] == 0)
    & customer_info_cleaned['Credit_History'].notnull()
]
y = not_eligible_credit_history.value_counts()
plt.pie(y, labels=["1",'0'], autopct='%.1f%%')
plt.show()
As suspected from our histogram for this column at the beginning of our notebook, 1 is the mode for not eligible candidates as well. The value 1 appears 16% more in the not eligible applicant group, so we will use this value to impute our not eligible candidate nulls. The credit history mode for both applicant groups is 1.
# fill all nulls within 'Credit_History' as 1
# (1 is the mode for both eligible and not-eligible applicants)
customer_info_cleaned['Credit_History'].fillna(1, inplace=True)
All of the nulls in our data are now accounted for!
Let's take a look at outliers in our data.
The following process is relevant for our non categorical variables:
# pairplot for non categorical features
# columns[5:9] are ApplicantIncome, CoapplicantIncome, LoanAmount, Loan_Amount_Term
sns.pairplot(customer_info_cleaned, vars = customer_info_cleaned.columns[5:9],hue="Loan_Status")
<seaborn.axisgrid.PairGrid at 0x1abc3d8bac0>
'ApplicantIncome'
'CoapplicantIncome'
LoanAmount
Loan_Amount_Term
# describe 'ApplicantIncome'
# summary statistics to gauge spread and spot extreme values
customer_info_cleaned.describe()[['ApplicantIncome']]
| ApplicantIncome | |
|---|---|
| count | 573.000000 |
| mean | 5328.514834 |
| std | 5909.329219 |
| min | 150.000000 |
| 25% | 2876.000000 |
| 50% | 3812.000000 |
| 75% | 5815.000000 |
| max | 81000.000000 |
# boxplot for 'ApplicantIncome'
# outliers are drawn as green diamond markers
green_diamond = dict(markerfacecolor='g', marker='D')
fig1, ax1 = plt.subplots()
ax1.set_title('Applicant Income')
ax1.boxplot(customer_info_cleaned['ApplicantIncome'], flierprops=green_diamond)
plt.plot()
[]
There are some really large values here that could seem suspicious.
Values that appear as outliers on our graph between 10,000 and 25,000 a month seem more realistic in terms of annual income, whereas 80,000 a month feels more extreme. However, there is no cap to income and we would expect to see applicants of diverse backgrounds.
Let's do a bivariate scatterplot with the 'LoanAmount' column to see if outliers for each feature occur within the same record, which could suggest or support that these high income values are actually valid.
# create scatterplot of 'ApplicantIncome' and 'LoanAmount'
# colored by Loan_Status to see whether outliers in both features co-occur
fig = px.scatter(customer_info_cleaned, x='ApplicantIncome', y='LoanAmount', color='Loan_Status')
fig.show()
# run description and boxplot of LoanAmount to understand what are considered outliers for this feature
# description
print(customer_info_cleaned.describe()[['LoanAmount']])
# boxplot (outliers drawn as green diamond markers)
green_diamond = dict(markerfacecolor='g', marker='D')
fig1, ax1 = plt.subplots()
ax1.set_title('Loan Amount Applied for')
ax1.boxplot(customer_info_cleaned['LoanAmount'], flierprops=green_diamond)
plt.plot()
LoanAmount count 573.000000 mean 145.373473 std 82.635712 min 9.000000 25% 100.000000 50% 128.000000 75% 165.000000 max 650.000000
[]
In our scatterplot, if Applicant Income goes above extreme outliers of 30,000+, the associated loan amounts are sometimes outliers or at least majority above the 50th percentile. Given this information and the positive relationship identified between the two variables, there is evidence these are valid data points. Another fact to consider is, even if someone does have high income, it is still reasonable for them to ask for a loan of any value.
Additionally, for the less extreme applicant income outliers, like 20,000, there does seem to be a general trend for loan amounts requested to be higher (480,000 to 650,000). This is as expected because folks with higher income are likely seeking to purchase more expensive homes.
With this is mind, there is not enough evidence to prove these are erroneous records. We will keep all values for both columns.
Lastly, there are not many applications for loans > 300,000. For the majority of applications above this amount, applicant income is also above 100,000 USD per year. Using information from articles like this (https://www.linkedin.com/pulse/what-income-needed-500k-mortgage-pierre-carapetian/), it is noted that at least 113,000 USD is required as annual income to afford a 500,000 home loan. The scatterplot above depicts this reality of higher income for higher loan amounts applied for, even considering the time since the article was written and inflation since then.
# run description for 'CoapplicantIncome' to confirm there are no odd numbers
customer_info_cleaned.describe()[['CoapplicantIncome']]
| CoapplicantIncome | |
|---|---|
| count | 573.000000 |
| mean | 1641.359372 |
| std | 3001.139055 |
| min | 0.000000 |
| 25% | 0.000000 |
| 50% | 1210.000000 |
| 75% | 2302.000000 |
| max | 41667.000000 |
# create histogram for 'CoapplicantIncome' (20 bins to expose the right tail)
customer_info_cleaned['CoapplicantIncome'].hist(bins=20)
<AxesSubplot:>
The data is skewed to the right, with only a few potentially erroneous data points that are > 10,000 USD.
# look at the full records for CoapplicantIncome > 10000 to inspect potential outliers
customer_info_cleaned.loc[customer_info_cleaned['CoapplicantIncome'] > 10000]
| Gender | Married | Dependents | Education | Self_Employed | ApplicantIncome | CoapplicantIncome | LoanAmount | Loan_Amount_Term | Credit_History | Property_Area | Loan_Status | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 9 | 1 | 1 | 1 | 0 | 0.0 | 12841 | 10968.0 | 349.0 | 360.0 | 1.0 | 1 | 0 |
| 177 | 1 | 1 | 3 | 0 | 0.0 | 5516 | 11300.0 | 495.0 | 360.0 | 0.0 | 1 | 0 |
| 402 | 1 | 0 | 0 | 0 | 0.0 | 2500 | 20000.0 | 103.0 | 360.0 | 1.0 | 1 | 1 |
| 417 | 1 | 1 | 2 | 0 | 1.0 | 1600 | 20000.0 | 239.0 | 360.0 | 1.0 | 2 | 0 |
| 581 | 1 | 0 | 0 | 0 | 0.0 | 1836 | 33837.0 | 90.0 | 360.0 | 1.0 | 2 | 0 |
| 600 | 0 | 0 | 3 | 0 | 0.0 | 416 | 41667.0 | 350.0 | 180.0 | 1.0 | 2 | 0 |
These points all seem valid because there are no odd patterns like all 0s for many of the non-categorical values. We will keep all data points because, as mentioned before, there is no cap to income, applicants can make any amount of money.
Interestingly enough, we notice that as coapplicant income rises, applicant income is relatively lower. This makes sense as people with lower incomes need a coapplicant with sufficient capital to make their application attractive for lenders.
Additionally, when Coapplicant Income is 0, we can assume there was no coapplicant on the home loan application.
# run description for 'Loan_Amount_Term' (term lengths, in months)
customer_info_cleaned.describe()[['Loan_Amount_Term']]
| Loan_Amount_Term | |
|---|---|
| count | 573.000000 |
| mean | 341.675393 |
| std | 65.186012 |
| min | 12.000000 |
| 25% | 360.000000 |
| 50% | 360.000000 |
| 75% | 360.000000 |
| max | 480.000000 |
This description of the data is extremely reasonable for loan terms. As we saw in the pairplot, a large majority of loans hover around 360 months (30 years), but there are cases where they are only a year, or even 40 years.
Now that we have a better feel for the data, it is clear some of the titles could use cleaning, so users know how to best interpret column values.
# rename measurement-bearing columns so their units are explicit in the title
customer_info_cleaned.rename(columns = {'ApplicantIncome':'ApplicantIncome_Monthly', 'CoapplicantIncome':'CoapplicantIncome_Monthly',
'LoanAmount':'LoanAmount_Thousands', 'Loan_Amount_Term':'Loan_Amount_Term_Months'}, inplace = True)
def pie_charts(df, columns):
    """For each listed feature, draw one pie of Loan_Status counts per unique value."""
    # constant color pair, hoisted out of the loops
    colors = ['lightcoral', 'lightskyblue']
    for feature in columns:
        for value in df[feature].unique():
            # Loan_Status breakdown among rows where this feature equals `value`
            counts = df['Loan_Status'].loc[df[feature] == value].value_counts()
            plt.pie(counts, labels=counts.index, autopct='%.1f%%', colors=colors)
            plt.legend(counts.index)
            plt.title(f"Loan eligibility by {feature} value {value}")
            plt.show()
# categorical columns: the first five features plus Credit_History and Property_Area
a = list(customer_info_cleaned.columns[0:5])
b = list(customer_info_cleaned.columns[9:11])
c = a+b
pie_charts(customer_info_cleaned, c)
Reminder of encoded values:
'Gender': Female = 0, Male = 1
'Married': No = 0, Yes = 1
'Dependents': 0 = 0, 1 = 1, 2 = 2, 3+ = 3
'Education': Graduate = 0, Not Graduate = 1
'Self Employed': No = 0, Yes = 1
'Property_Area': Rural = 0, Semi-Urban = 1, Urban = 2
'Loan_Status': N = 0, Y = 1
Summary of pie charts:
Gender
Married
Dependents
Education
Self Employed
Credit History
Property Area
# understand correlations between all variables
sns.set_theme(style="white")
# Compute the correlation matrix
corr = customer_info_cleaned.corr()
# Generate a mask for the upper triangle (redundant half of the symmetric matrix)
mask = np.triu(np.ones_like(corr, dtype=bool))
# Set up the matplotlib figure
f, ax = plt.subplots(figsize=(16, 12))
# Generate a custom diverging colormap
cmap = sns.diverging_palette(230, 20, as_cmap=True)
# Draw the heatmap with the mask and correct aspect ratio
sns.heatmap(corr, cmap=cmap, mask=mask, vmax=.3, center=0,
square=True, linewidths=.5, cbar_kws={"shrink": .5}, annot= True)
<AxesSubplot:>
As we saw in our pairplot earlier, applicant income and loan amount have a strong positive correlation. It turns out these are the variables with the strongest correlation in the data. Followed closely by credit history and loan status. It makes sense that this correlation is positive because 1 means there is a credit history in our data as well as 1 for loan status means the applicant is eligible.
As we also saw in our pie charts, education has a relationship with loan status. It is negative here because Graduated = 0 and Eligible = 1. We notice a similar relationship between loan status and self employed, Not Self Employed = 0 and Eligible = 1 and they are negatively correlated.
Based on this information, we realize that applicant income (both their own and coapplicant income) and loan amount are not extremely meaningful on their own, and it will be useful in our modeling to have a feature that is a ratio of applicant total income to loan amount. Additionally, presence of a coapplicant may be important, so we will make a categorical variable highlighting if the applicant had a coapplicant or not.
# create new value for applicant income to loan amount
# (monthly incomes x12 = yearly; LoanAmount is stored in thousands, so x1000)
customer_info_cleaned['Applicant_Income(total yearly)_to_Loan_Amount(total)'] = (((customer_info_cleaned['ApplicantIncome_Monthly']*12) + (customer_info_cleaned['CoapplicantIncome_Monthly']*12)) / (customer_info_cleaned['LoanAmount_Thousands']*1000))
# create column denoting if there is a coapplicant through values 0 = N and 1 = Y
def coapplicant(data, column):
    """Add a binary 'Coapplicant' column to `data`: 1 when `column` is nonzero.

    BUG FIX: the original wrote the result to the global customer_info_cleaned
    instead of the `data` parameter, so the function only worked by accident
    on that one DataFrame and broke for any other argument.

    Returns the same DataFrame for convenience.
    """
    data['Coapplicant'] = [0 if income == 0 else 1 for income in data[column]]
    return data
# add the Coapplicant flag column (mutates customer_info_cleaned in place)
coapplicant(customer_info_cleaned,'CoapplicantIncome_Monthly')
| Gender | Married | Dependents | Education | Self_Employed | ApplicantIncome_Monthly | CoapplicantIncome_Monthly | LoanAmount_Thousands | Loan_Amount_Term_Months | Credit_History | Property_Area | Loan_Status | Applicant_Income(total yearly)_to_Loan_Amount(total) | Coapplicant | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 0 | 0 | 0 | 0.0 | 5849 | 0.0 | 128.0 | 360.0 | 1.0 | 2 | 1 | 0.548344 | 0 |
| 1 | 1 | 1 | 1 | 0 | 0.0 | 4583 | 1508.0 | 128.0 | 360.0 | 1.0 | 0 | 0 | 0.571031 | 1 |
| 2 | 1 | 1 | 0 | 0 | 1.0 | 3000 | 0.0 | 66.0 | 360.0 | 1.0 | 2 | 1 | 0.545455 | 0 |
| 3 | 1 | 1 | 0 | 1 | 0.0 | 2583 | 2358.0 | 120.0 | 360.0 | 1.0 | 2 | 1 | 0.494100 | 1 |
| 4 | 1 | 0 | 0 | 0 | 0.0 | 6000 | 0.0 | 141.0 | 360.0 | 1.0 | 2 | 1 | 0.510638 | 0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 609 | 0 | 0 | 0 | 0 | 0.0 | 2900 | 0.0 | 71.0 | 360.0 | 1.0 | 0 | 1 | 0.490141 | 0 |
| 610 | 1 | 1 | 3 | 0 | 0.0 | 4106 | 0.0 | 40.0 | 180.0 | 1.0 | 0 | 1 | 1.231800 | 0 |
| 611 | 1 | 1 | 1 | 0 | 0.0 | 8072 | 240.0 | 253.0 | 360.0 | 1.0 | 2 | 1 | 0.394245 | 1 |
| 612 | 1 | 1 | 2 | 0 | 0.0 | 7583 | 0.0 | 187.0 | 360.0 | 1.0 | 2 | 1 | 0.486610 | 0 |
| 613 | 0 | 0 | 0 | 0 | 1.0 | 4583 | 0.0 | 133.0 | 360.0 | 0.0 | 1 | 0 | 0.413504 | 0 |
573 rows × 14 columns
# drop the raw income and loan-amount columns from the df;
# they are now represented by the engineered ratio and Coapplicant columns
customer_info_cleaned.drop(columns = ['ApplicantIncome_Monthly','CoapplicantIncome_Monthly','LoanAmount_Thousands'], inplace = True)
# understand correlations between all variables again to include engineered features
sns.set_theme(style="white")
# Compute the correlation matrix
corr = customer_info_cleaned.corr()
# Generate a mask for the upper triangle (redundant half of the symmetric matrix)
mask = np.triu(np.ones_like(corr, dtype=bool))
# Set up the matplotlib figure
f, ax = plt.subplots(figsize=(16, 12))
# Generate a custom diverging colormap
cmap = sns.diverging_palette(230, 20, as_cmap=True)
# Draw the heatmap with the mask and correct aspect ratio
sns.heatmap(corr, cmap=cmap, mask=mask, vmax=.3, center=0,
square=True, linewidths=.5, cbar_kws={"shrink": .5}, annot= True)
<AxesSubplot:>
When rerunning the correlation heatmap with 'Applicant_Income(total yearly)_to_Loan_Amount(total)' and 'Coapplicant', we notice that the presence of a coapplicant does have a positive correlation with 'Loan_Status'. Meaning, as the coapplicant value rises (1 = coapplicant present), so does loan status (1 = Eligible).
# what factors were commonly present for those with 0 credit history, but who were eligible for a loan?
customer_info_cleaned.loc[(customer_info_cleaned['Credit_History'] == 0) & (customer_info_cleaned['Loan_Status'] == 1)]
| Gender | Married | Dependents | Education | Self_Employed | Loan_Amount_Term_Months | Credit_History | Property_Area | Loan_Status | Applicant_Income(total yearly)_to_Loan_Amount(total) | Coapplicant | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 122 | 0 | 0 | 0 | 0 | 0.0 | 360.0 | 0.0 | 1 | 1 | 0.973752 | 1 |
| 155 | 1 | 1 | 3 | 0 | 0.0 | 180.0 | 0.0 | 1 | 1 | 0.799980 | 0 |
| 201 | 1 | 0 | 2 | 0 | 0.0 | 360.0 | 0.0 | 1 | 1 | 0.355880 | 0 |
| 267 | 1 | 1 | 3 | 0 | 0.0 | 360.0 | 0.0 | 2 | 1 | 0.460800 | 1 |
| 326 | 1 | 0 | 0 | 0 | 0.0 | 360.0 | 0.0 | 0 | 1 | 0.453877 | 0 |
| 453 | 1 | 1 | 0 | 0 | 0.0 | 180.0 | 0.0 | 0 | 1 | 0.659867 | 1 |
| 527 | 1 | 1 | 1 | 1 | 0.0 | 360.0 | 0.0 | 1 | 1 | 0.500497 | 1 |
Some supporting insights from this group within our home applicant data include:
This suggests that if an applicant has no credit history, but is educated, not self employed, and has a coapplicant when needed (based on applicant's income to loan amount), then there is a chance they will be eligible for a loan.
# persist the cleaned training data for the modeling notebook
customer_info_cleaned.to_csv('../data/train_cleaned.csv',index=False)
Based on my analysis...
People who are most eligible for home loans have a credit history, education, an employer that is not themselves, and a coapplicant.
# create function to drop any rows with null values for features where the number
# of null values is <3% of the total rows in our data
def drop_nas(df):
    """Drop rows (in place) that are null in any feature whose null count is
    below 3% of the dataset's *original* row count.

    Returns the same DataFrame for convenience.
    """
    # BUG FIX: compute the 3% threshold from the original row count once;
    # the original recomputed len(df) after each drop, so the threshold
    # silently shrank as rows were removed.
    threshold = len(df) * .03
    for feature in df.columns:
        null_sum = df[feature].isnull().sum()
        if 0 < null_sum < threshold:
            df.dropna(subset=[feature], inplace=True)
    return df
# create function to encode object values with alphabetical label encoding
# leave null values for now until we impute their values
def column_encoder(data):
    """Label-encode the categorical columns of `data` in place, skipping nulls.

    pd.factorize(..., sort=True) assigns integer codes in sorted (alphabetical)
    order — exactly the codes sklearn's LabelEncoder produced — and writing the
    result back through .loc[mask, col] avoids the chained-assignment
    SettingWithCopyWarning the original column-by-column version triggered
    (chained assignment may silently fail to update the frame).

    Returns the same DataFrame for convenience.
    """
    categorical_columns = ['Gender', 'Married', 'Dependents', 'Education',
                           'Self_Employed', 'Property_Area', 'Loan_Status']
    for col in categorical_columns:
        not_null = data[col].notnull()
        data.loc[not_null, col] = pd.factorize(data.loc[not_null, col], sort=True)[0]
    return data
# create column denoting if there is a coapplicant through values 0 = N and 1 = Y
def coapplicant(data,column):
    """Flag each row with 1 when `column` holds a nonzero value, else 0.

    Adds the result as a 'Coapplicant' column and returns the same DataFrame.
    """
    data['Coapplicant'] = [0 if amount == 0 else 1 for amount in data[column]]
    return data
# create function to clean data to match process from train.csv EDA
# this function assumes we have checked that the test data has the same columns as our train
def clean_test(df):
    """Apply the full train-EDA cleaning pipeline to a raw applicant DataFrame.

    Returns a new cleaned DataFrame; the input `df` is left unmodified.
    Relies on the sibling helpers drop_nas, column_encoder, and coapplicant.
    """
    test_clean_df = df.copy()
    # Loan_ID is a record identifier with no predictive value
    test_clean_df.drop(columns=['Loan_ID'], inplace=True)
    # drop rows with nulls in features where <3% of rows are null (mutates in place)
    drop_nas(test_clean_df)
    # label-encode the categorical columns (nulls are preserved for imputation below)
    column_encoder(test_clean_df)
    # dtypes are still objects from column_encoder, change to numeric
    columns = ['Gender','Married','Dependents','Education','Self_Employed','Property_Area','Loan_Status']
    test_clean_df[columns] = test_clean_df[columns].apply(pd.to_numeric, errors='coerce')
    # impute Self_Employed nulls to mode if applicable
    self_employed_mode = test_clean_df['Self_Employed'].mode()
    test_clean_df['Self_Employed'].fillna(value=self_employed_mode.values[0],inplace=True)
    # impute Credit_History nulls to mode if applicable
    credit_history_mode = test_clean_df['Credit_History'].mode()
    test_clean_df['Credit_History'].fillna(value=credit_history_mode.values[0],inplace=True)
    # impute LoanAmount nulls to median if applicable (distribution is skewed)
    loan_amount_median = test_clean_df['LoanAmount'].median()
    test_clean_df['LoanAmount'].fillna(value=loan_amount_median,inplace=True)
    # rename loan term column to include its measurement - months
    test_clean_df.rename(columns = {'Loan_Amount_Term':'Loan_Amount_Term_Months'}, inplace = True)
    # engineered feature: total yearly household income / total loan amount
    # (monthly incomes x12 = yearly; LoanAmount is stored in thousands, so x1000)
    test_clean_df['Applicant_Income(total yearly)_to_Loan_Amount(total)'] = (((test_clean_df['ApplicantIncome']*12) + (test_clean_df['CoapplicantIncome']*12)) / (test_clean_df['LoanAmount']*1000))
    # engineered feature: binary flag for presence of a coapplicant
    coapplicant(test_clean_df,'CoapplicantIncome')
    # raw income and loan-amount columns are now represented by the engineered features
    test_clean_df.drop(columns = ['ApplicantIncome','CoapplicantIncome','LoanAmount'],inplace = True)
    # sanity filters: drop rows whose encoded values fall outside their valid ranges.
    # BUG FIX: the original wrapped each binary-column drop in
    # `for i in test_clean_df[feature]:`, re-executing the identical whole-frame
    # drop once per row — O(rows) redundant passes with no effect on the result.
    # check that Gender, Married, Education, Self Employed values are 0 or 1
    for feature in ['Gender','Married','Education','Self_Employed']:
        test_clean_df.drop(test_clean_df.loc[(test_clean_df[feature] < 0) | (test_clean_df[feature] > 1)].index, inplace=True)
    # drop values for Dependents outside of 0-3
    test_clean_df.drop(test_clean_df.loc[(test_clean_df['Dependents'] < 0) | (test_clean_df['Dependents'] > 3)].index, inplace=True)
    # drop values for Property_Area outside 0-2
    test_clean_df.drop(test_clean_df.loc[(test_clean_df['Property_Area'] < 0) | (test_clean_df['Property_Area'] > 2)].index, inplace=True)
    # drop Loan_Amount_Term < 0 and > 480 (40 years)
    test_clean_df.drop(test_clean_df.loc[(test_clean_df['Loan_Amount_Term_Months'] < 0) | (test_clean_df['Loan_Amount_Term_Months'] > 480)].index, inplace=True)
    # the income-to-loan ratio can never legitimately be negative
    test_clean_df.drop(test_clean_df.loc[test_clean_df['Applicant_Income(total yearly)_to_Loan_Amount(total)'] < 0].index, inplace=True)
    return test_clean_df